##################################################################
##################################################################
## Replication Material
## Stefan Müller: The Temporal Focus of Campaign Communication
## The Journal of Politics
## stefan.mueller@ucd.ie
##
## Script 8: Results reported in SI Section F
##################################################################
##################################################################


# Note: The file description_replication_material_jop_mueller.pdf describes the purpose of this 
# file in detail and lists the names and sources of all datasets 
# used in this script

# This script was run on the following R version, platform and OS:
# R version 3.6.0 (2019-04-26)
# Platform: x86_64-apple-darwin15.6.0 (64-bit)
# Running under: macOS Catalina 10.15.5


# load packages required to run this script
library(dplyr)               # CRAN v1.0.0
library(ggplot2)             # CRAN v3.3.2
library(quanteda)            # CRAN v2.0.1
library(quanteda.textmodels) # CRAN v0.9.1
library(scales)              # CRAN v1.1.1
library(car)                 # CRAN v3.0-7
library(foreign)             # CRAN v0.8-76
library(Hmisc)               # CRAN v4.4-0
library(foreign)             # CRAN v0.8.76

# Custom ggplot2 theme used for the figures in this script.
# Takes theme_minimal() and replaces the elements listed below: all grid
# lines are blanked, the panel gets a solid black border, text sizes are set
# explicitly, and the legend is placed at the bottom.
# Returns a ggplot2 theme object (suitable for theme_set() or `+`).
theme_baser <- function() {
  blank <- element_blank()

  replacements <- theme(
    # panel: no grid lines, thin solid frame
    panel.grid.major.x = blank,
    panel.grid.major.y = blank,
    panel.grid.minor.x = blank,
    panel.grid.minor.y = blank,
    panel.border = element_rect(
      fill = NA, color = "black", size = 0.5, linetype = "solid"
    ),
    # plot title and facet strips
    plot.title = element_text(
      size = 15, face = "italic", vjust = 1.5, hjust = 0,
      margin = margin(t = 0, r = 0, b = 12, l = 0)
    ),
    strip.text = element_text(
      size = 15, hjust = 0.5,
      margin = margin(t = 5, r = 5, b = 5, l = 5)
    ),
    # axes
    axis.title = element_text(size = 13, hjust = 0.5),
    axis.text = element_text(colour = "black", size = 13),
    axis.ticks = element_line(size = 0.3),
    axis.ticks.length = unit(0.2, "cm"),
    # legend
    legend.position = "bottom",
    legend.title = element_text(size = 15),
    legend.text = element_text(size = 13)
  )

  theme_minimal() %+replace% replacements
}

# set theme: make theme_baser() the default theme for every plot created below
theme_set(theme_baser())

# analyse Irish budget debates

# increase speed by using more threads (not necessary - code also runs without this command)
# NOTE(review): the thread count is hard-coded to 10; presumably this should
# not exceed the number of cores available on the machine - confirm
quanteda_options("threads" = 10)

# load data (from Proksch et al. 2019 replication material):
# https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/ALFLK6
# Both files are read relative to the current working directory. The object
# budgetCorpus used further below is not created in this script, so it is
# presumably restored by one of these load() calls - confirm

load("prokschetal_3_master_data.Rdata")
load("prokschetal_3_positions_before_senti.Rdata")

# load English dataset with human-annotated sentences
# (used below as the training set for the SVM classifier)
dat_full_en <- readRDS("data_sentences_classified_english.rds")

# get document-level variables of the budget debate corpus
data <- docvars(budgetCorpus)

# recode parties (mapping from Proksch et al 2019 replication material):
# each party name listed below is translated into its abbreviation via a
# named lookup vector; names that are not listed (or are missing) become NA
data$partyAbbrev <- unname(c(
  "Democratic Left" = "DL",
  "Democratic Socialist Party" = "DSP",
  "Fianna Fáil" = "FF",
  "FF" = "FF",
  "Fine Gael" = "FG",
  "Green Party" = "GRE",
  "Independent" = "Indp",
  "Independent Fianna Fáil" = "Indp",
  "Other" = "Other",
  "People Before Profit Alliance" = "PBPA",
  "Progressive Democrats" = "PD",
  "Sinn Féin" = "SF",
  "Socialist Party" = "SP",
  "Socialist Party 2011" = "SP",
  "Labour Party" = "LAB",
  "The Labour Party" = "LAB",
  "The Workers' Party" = "WP",
  "Workers and Unemployed Action Group South-Tipperary" = "WUAG"
)[as.character(data$party_name)])



# Code parties in government (govt = 1) and opposition (govt = 0):
# every speech starts as opposition and is switched to 1 when the speaker's
# party was part of the cabinet in the respective period
data$govt <- 0

# 1983 - 1986: Fine Gael + Labour
data$govt[with(data, debate_year >= 1983 & debate_year <= 1986 &
                 partyAbbrev %in% c("LAB", "FG"))] <- 1

# 1987 - 1989: Fianna Fail
data$govt[with(data, debate_year >= 1987 & debate_year <= 1989 &
                 partyAbbrev %in% "FF")] <- 1

# 1990 - 1992: Fianna Fail + Progressive Democrats
data$govt[with(data, debate_year >= 1990 & debate_year <= 1992 &
                 partyAbbrev %in% c("FF", "PD"))] <- 1

# 1993 - 1994: Fianna Fail + Labour
data$govt[with(data, debate_year >= 1993 & debate_year <= 1994 &
                 partyAbbrev %in% c("FF", "LAB"))] <- 1

# 1995 - 1997: Fine Gael + Labour + Democratic Left
# (upper bound uses budget_year because there were two debates in 1997)
data$govt[with(data, debate_year >= 1995 & budget_year <= 1997 &
                 partyAbbrev %in% c("FG", "LAB", "DL"))] <- 1

# 1998 - 2006: Fianna Fail + Progressive Democrats
# (lower bound uses budget_year, complementing the 1997 cut-off above)
data$govt[with(data, budget_year >= 1998 & debate_year <= 2006 &
                 partyAbbrev %in% c("FF", "PD"))] <- 1

# 2007 - 2010: Fianna Fail + Green + Progressive Democrats
data$govt[with(data, debate_year >= 2007 & debate_year <= 2010 &
                 partyAbbrev %in% c("FF", "PD", "GRE"))] <- 1

# 2011 - 2013: Fine Gael + Labour
data$govt[with(data, debate_year >= 2011 & debate_year <= 2013 &
                 partyAbbrev %in% c("FG", "LAB"))] <- 1

# number of speeches by opposition (0) and government (1)
table(data$govt)

# drop the original docvars from the loaded corpus
docvars(budgetCorpus) <- NULL

# copy the corpus and attach the recoded docvars (party abbreviation, govt)
data_corpus_budget <- budgetCorpus
docvars(data_corpus_budget) <- data

# check how often the cheann_comhairle (chairperson) is speaking
table(docvars(data_corpus_budget, "cheann_comhairle"))

# only keep relevant speeches, one condition at a time:
# 1) drop speeches by the chairperson
data_corpus_budget_clean <- corpus_subset(data_corpus_budget,
                                          cheann_comhairle == FALSE)
# 2) drop speeches by the deputy chairperson
data_corpus_budget_clean <- corpus_subset(data_corpus_budget_clean,
                                          leas_cheann_comhairle == FALSE)
# 3) drop speeches whose budget_year is coded as 0
data_corpus_budget_clean <- corpus_subset(data_corpus_budget_clean,
                                          budget_year != 0)

# after cleaning, only cheann_comhairle == FALSE should remain
table(docvars(data_corpus_budget_clean, "cheann_comhairle"))

# number of speeches removed by the three conditions
ndoc(data_corpus_budget) - ndoc(data_corpus_budget_clean)

# reshape corpus to sentences (one document per sentence)
data_corpus_budget_sentences <- corpus_reshape(data_corpus_budget_clean,
                                               to = "sentences")


# create tokens object from the human-annotated training dataset
toks_train <- tokens(corpus(dat_full_en))

# remove party names from training set
# NOTE(review): paste() joins with a space by default, so every pattern is
# "<party name> *"; phrase() then presumably treats "*" as a separate
# wildcard token following the party name - confirm this is intended
toks_train <- tokens_remove(
  toks_train,
  pattern = phrase(paste(unique(data$party_name), "*"))
)

# create dfm for training
dfmat_train <- dfm(toks_train, remove_punct = TRUE)

# create dfm of the sentences from the budget speeches
dfmat_test <- dfm(tokens(data_corpus_budget_sentences), remove_punct = TRUE)


# run SVM: features are the training dfm, labels its "class" docvar
tmod_svm <- textmodel_svm(dfmat_train, docvars(dfmat_train, "class"))

# predict the class of every speech sentence; force = TRUE lets the
# prediction proceed although the two dfms do not share the same features
pred_tmod_svm <- predict(tmod_svm, dfmat_test, force = TRUE)

# summary statistics of class: absolute and relative frequencies
table(pred_tmod_svm)
prop.table(table(pred_tmod_svm))

# apply sentiment dictionary (data_dictionary_LSD2015) to each sentence and
# convert the resulting category counts to a data frame
sent_dfmat_test <- tokens(data_corpus_budget_sentences)
sent_dfmat_test <- tokens_remove(sent_dfmat_test,
                                 pattern = "ireland*") # would otherwise be scored as negative
sent_dfmat_test <- tokens_lookup(sent_dfmat_test,
                                 dictionary = data_dictionary_LSD2015)
sent_dfmat_test <- convert(dfm(sent_dfmat_test), to = "data.frame")

# bind sentiment counts and document-level variables column-wise
dat_classified <- bind_cols(sent_dfmat_test,
                            docvars(data_corpus_budget_sentences))

# assign predicted class and number of tokens (without punctuation) per sentence
dat_classified$class <- pred_tmod_svm
dat_classified$ntoken <- ntoken(data_corpus_budget_sentences, remove_punct = TRUE)

# restrict the sample and add the grouping variables used in the figures:
# - budgets from 1997 onwards (budget_year != 0 is kept from the original
#   script; it cannot remove anything once budget_year >= 1997 holds)
# - only sentences with more than five tokens
dat_classified <- dat_classified %>%
  filter(budget_year >= 1997,
         budget_year != 0,
         ntoken > 5) %>%
  mutate(
    incumbency_status2_factor = factor(
      ifelse(govt == 1, "Government",
             ifelse(govt == 0, "Opposition", NA))
    ),
    boom_crisis = ifelse(budget_year < 2008, "Boom (1997-2007)",
                         ifelse(budget_year >= 2008, "Crisis (2008-2013)", NA))
  )



# Logged ratio of positive to negative dictionary counts.
#
# positive, negative:         counts of positive / negative terms
# neg_negative, neg_positive: counts of negated negative / negated positive
#                             terms; only used when negations is TRUE, and
#                             missing values in them are ignored
# negations:                  if TRUE, negated negative terms are added to the
#                             positive side and negated positive terms to the
#                             negative side
#
# 0.5 is added to numerator and denominator so the ratio is defined when one
# of the sums is zero. Returns a single number.
sent_log <- function(positive, negative, neg_negative, neg_positive,
                     negations = FALSE) {
  # plain version: negated terms are not taken into account
  if (negations == FALSE) {
    return(log((sum(positive) + 0.5) / (sum(negative) + 0.5)))
  }

  # negation-adjusted version
  adjusted_positive <- sum(positive) + sum(neg_negative, na.rm = TRUE)
  adjusted_negative <- sum(negative) + sum(neg_positive, na.rm = TRUE)
  log((adjusted_positive + 0.5) / (adjusted_negative + 0.5))
}

# summarise sentiment per speech: one negation-adjusted log sentiment score
# for every combination of date, speaker (with party and role variables) and
# predicted class
dat_classified_sent_speech <- dat_classified %>%
  group_by(
    budget_year, month, day, boom_crisis,
    partyAbbrev, govt, incumbency_status2_factor,
    finance_minister, opposition_spokesperson,
    last_name, first_name, class
  ) %>%
  summarise(
    sent_lsd_log = sent_log(
      positive = positive,
      negative = negative,
      neg_negative = neg_negative,
      neg_positive = neg_positive,
      negations = TRUE
    )
  ) %>%
  mutate(speaker_year = paste(budget_year, last_name, first_name, sep = "_"))


# get proportions of sentences (and words) in each class; summarise() drops
# the last grouping variable (class), so the shares are computed within
# budget year, period and incumbency status
dat_classified_prop <- dat_classified %>%
  group_by(budget_year, boom_crisis, incumbency_status2_factor, class) %>%
  summarise(
    n_sentences = n(),
    n_words = sum(ntoken)
  ) %>%
  mutate(
    relfreq_sentences = n_sentences / sum(n_sentences),
    relfreq_words = n_words / sum(n_words)
  )


# order the classes chronologically for plotting
dat_classified_prop$class <- factor(
  dat_classified_prop$class,
  levels = c("Past", "Present", "Future")
)

# Figure A33 ----
# Share of sentences per temporal class by incumbency status, with one box
# per period (boom vs. crisis) and one point per budget year.
ggplot(dat_classified_prop,
       aes(x = incumbency_status2_factor,
           y = relfreq_sentences,
           colour = boom_crisis)) +
  # outliers are drawn in white because all observations are added as points
  geom_boxplot(outlier.colour = "white", position = position_dodge(width = 0.8)) +
  geom_point(size = 1, alpha = 0.5, position = position_dodge(width = 0.8)) +
  # one panel per class (the original script added this facet twice)
  facet_wrap(~class) +
  scale_colour_manual(values = c("darkgreen", "red", "blue")) +
  labs(x = NULL, y = "Temporal focus") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1),
                     limits = c(0, 0.6), breaks = seq(0, 0.6, by = 0.2)) +
  theme(legend.position = "right",
        legend.title = element_blank())
# ggsave() without a plot argument writes the most recently created plot
ggsave("fga33.pdf", width = 10, height = 4)


# bootstrap sentiment by year, incumbency status, and class
# The seed is fixed so that the resampling-based intervals can be reproduced.
# For every group, Hmisc::smean.cl.boot() is applied to the speech-level
# sentiment scores; rbind() turns its result into a single row so that do()
# returns one row per group. The resulting columns Mean, Lower and Upper are
# used for the point ranges in Figure A34.
set.seed(214)
dat_classified_sent_speech_boot <- dat_classified_sent_speech %>% 
  group_by(budget_year, boom_crisis, incumbency_status2_factor, class) %>% 
  do(data.frame(rbind(Hmisc::smean.cl.boot(.$sent_lsd_log))))


# keep only rows with a positive budget year
# NOTE(review): dat_classified is already restricted to budget_year >= 1997
# further up, so this filter presumably removes nothing - confirm
dat_classified_sent_speech_boot <- filter(dat_classified_sent_speech_boot, budget_year > 0)

# order the classes chronologically for plotting
dat_classified_sent_speech_boot$class <- factor(dat_classified_sent_speech_boot$class,
                                                levels = c("Past", "Present", "Future"))

# Figure A34 ----
# Bootstrapped mean sentiment (with interval) per budget year, separately for
# government and opposition, with one panel per temporal class. The dashed
# vertical line at 2008 separates the boom from the crisis period.
ggplot(
  data = dat_classified_sent_speech_boot,
  mapping = aes(
    x = budget_year,
    y = Mean, ymin = Lower, ymax = Upper,
    colour = incumbency_status2_factor,
    shape = incumbency_status2_factor
  )
) +
  geom_pointrange() +
  geom_smooth() +
  geom_vline(xintercept = 2008, colour = "grey60", linetype = "dashed") +
  scale_colour_manual(values = c("black", "red")) +
  scale_shape_manual(values = c(16, 1)) +
  # period labels on either side of the 2008 line
  annotate("text",
           x = c(2003, 2011), y = 1.7,
           label = c("Boom", "Crisis"),
           colour = "grey60", size = 5) +
  facet_wrap(~class) +
  scale_x_continuous(limits = c(1997, 2013),
                     breaks = seq(1997, 2013, by = 3)) +
  labs(x = "Year of budget debate",
       y = "Sentiment (Proksch et al. 2019)") +
  theme(legend.title = element_blank(),
        legend.position = "bottom")
ggsave("fga34.pdf", width = 10, height = 6)


# analysis of 2013 TV debate

# load content analysis of TV debate
# file can be downloaded for free from the GESIS website (registration required)
# https://dbk.gesis.org/dbksearch/sdesc2.asp?no=5710&db=e&doi=10.4232/1.13203

dat_2013 <- foreign::read.dta(file = "../data_notshare/ZA5710_v2-2-0.dta")

# copy the coded variables to readable names
dat_2013_politicians <- mutate(dat_2013,
                               speaker = v10,
                               tense = v19,
                               sentiment = v28)

# keep only statements by the two politicians
dat_2013_politicians <- filter(
  dat_2013_politicians,
  speaker %in% c("Merkel, Angela", "Steinbrueck, Peer")
)

# translate the German codes; every tense other than past/present/future
# becomes "Other", every sentiment code other than 'Ja'/'Nein' becomes "Neutral"
# NOTE(review): 'Ja' is mapped to Negative, so v28 presumably records whether
# a statement is negative - confirm against the codebook
dat_2013_politicians <- mutate(
  dat_2013_politicians,
  tense = car::recode(tense, "'Vergangenheit'='Past';'Gegenwart/Allgemeingueltigkeit'='Present';'Zukunft'='Future';else='Other'"),
  sentiment = car::recode(sentiment, "'Ja'='Negative'; 'Nein'='Positive';else='Neutral'")
)

# drop statements without a past, present, or future reference
dat_2013_politicians <- filter(dat_2013_politicians, tense != "Other")

# label speakers with their incumbency status
dat_2013_politicians <- mutate(
  dat_2013_politicians,
  speaker_inc = ifelse(speaker == "Merkel, Angela",
                       "Merkel (Incumbent)",
                       "Steinbrück (Opposition)")
)


# get proportion of statements about the past, present, and future by speaker;
# summarise() drops the tense grouping, so the shares sum to one per speaker
dat_2013_sum_class <- dat_2013_politicians %>%
  group_by(speaker_inc, tense) %>%
  summarise(n_sentences = n()) %>%
  mutate(relfreq_sentences = n_sentences / sum(n_sentences))

# order the classes chronologically for plotting
dat_2013_sum_class$tense <- factor(
  dat_2013_sum_class$tense,
  levels = c("Past", "Present", "Future")
)

# Figure A35 ----
# Share of statements per tense for each speaker; the white labels inside
# the bars give the absolute number of statements.
ggplot(data = dat_2013_sum_class,
       mapping = aes(x = tense, y = relfreq_sentences)) +
  geom_col() +
  facet_wrap(~speaker_inc) +
  geom_text(aes(label = n_sentences),
            nudge_y = -0.1, size = 5, colour = "white") +
  labs(x = NULL, y = "Percentage")
ggsave("fga35.pdf", width = 10, height = 4)


# get relative frequencies of each class and sentiment in debates;
# summarise() drops the sentiment grouping, so the shares sum to one within
# each speaker and tense
dat_2013_sum <- dat_2013_politicians %>% 
  group_by(speaker_inc, tense, sentiment) %>% 
  summarise(n_sentences = n()) %>%
  mutate(relfreq_sentences = n_sentences / sum(n_sentences))

# order the classes chronologically for plotting
dat_2013_sum$tense <- factor(dat_2013_sum$tense,
                             levels = c("Past", "Present", "Future"))

# put the incumbency status on a second line of the axis label by replacing
# " (" with a line break followed by "(".
# Base gsub() is used because stringr is not attached in this script, so the
# former call to str_replace_all() stopped with "could not find function".
dat_2013_sum <- dat_2013_sum %>% 
  ungroup() %>% 
  mutate(speaker_inc = gsub(" (", "\n(", speaker_inc, fixed = TRUE))


# Figure A36 ----
# Share of negative, neutral and positive statements per speaker, with one
# panel per tense and the sentiment categories shown as dodged bars.
ggplot(data = dat_2013_sum,
       mapping = aes(x = speaker_inc,
                     y = relfreq_sentences,
                     fill = sentiment)) +
  geom_col(width = 0.6, position = position_dodge(width = 0.7)) +
  facet_wrap(~tense) +
  scale_fill_manual(values = c("red", "grey", "darkgreen")) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(x = NULL, y = "Percentage of statements\nwithin each class") +
  theme(legend.title = element_blank(), legend.position = "bottom")
ggsave("fga36.pdf", width = 10, height = 4)
